#delim cr
set more off
set varabbrev off
pause on
graph set ps logo off

if !(`1' >= 1 & `1' <= 4) {
	kaboom
}

local step = `1'

capture log close
set linesize 255
set logtype text
log using ../log/run-quantile-regressions-`step'.log , replace

/* --------------------------------------

AUTHOR: Tal Gross

PURPOSE: Look at how air-quality affects
the workers at CTRIP.

DATE CREATED: June 12, 2014

NOTES:

--------------------------------------- */

clear all
estimates clear
set matsize 10000
describe, short

************************************************************
**   Programs Used Below                          
************************************************************

capture program drop tabresults
program tabresults
	**** disp "Readable Output:  "
	**** esttab * , keep( `1' ) ///
	**** stats(r2 N, fmt(%9.3f %-12.0gc)) ///
	**** cells(b( fmt(%9.4f)) se( par(( )) fmt(%9.4f)) p(par([ ]) fmt(%9.4f)) ) replace  

	**** disp "Tab-separated numbers for Excel:"
	**** estout * , keep( `1' ) ///
	**** stats(r2 N, fmt(%9.6f %9.0g)) ///   
	**** cells(b(nostar fmt(%9.4f)) se(fmt(%9.4f) nostar) p(fmt(%9.4f) )) ///
	**** style(tab) replace type mlabels(, nonumbers )

	**** disp "Raw numbers for Excel:"
	**** estout * , keep( `1' ) ///
	**** stats(r2 N, fmt(%9.6f %9.0g)) ///   
	**** cells(b(nostar fmt(%9.6f)) se(fmt(%9.6f) nostar) p(fmt(%9.6f) )) ///
	**** style(fixed) replace type mlabels(, nonumbers )

	disp "Formatted Tables:"
	estout * , keep( `1' ) ///
	stats(r2 N, fmt(%9.6f %9.0g)) ///   
	cells(b(star fmt(%9.4f)) se(fmt(%9.4f) par([ ]) nostar) ) ///
	style(fixed) replace type starlevels(* 0.05 ** 0.01)

	estimates clear
end

** We use this code to create temperature bins

capture program drop make_weather_bins
program make_weather_bins

	disp "Temperature range in Shanghai:"
	sum ltemp if shanghai_prod_sample == 1
	disp "Temperature range in Nantong:"
	sum ltemp if nantong_prod_sample == 1
	** Create temperature bins interacted with city indicators...
	gen ltemp_m_40 = (ltemp >= -100 & ltemp < 40)
	gen ltemp_40_50 = (ltemp >= 40 & ltemp < 50)
	gen ltemp_50_60 = (ltemp >= 50 & ltemp < 60)
	gen ltemp_60_70 = (ltemp >= 60 & ltemp < 70)
	gen ltemp_70_80 = (ltemp >= 70 & ltemp < 80)
	gen ltemp_80_p = (ltemp >= 80 & ltemp < .)

	foreach var in ltemp_m_40 ltemp_40_50 ltemp_50_60 ltemp_60_70 ltemp_70_80 ltemp_80_p {
		gen sh_`var' = shanghai_prod_sample * `var'
		gen na_`var' = nantong_prod_sample * `var'
	}
end

if "`step'" != "4" {
************************************************************
**   Bring in all our weather data
************************************************************

** We take only the weather data from this dataset:
use ../dta/all-our-environmental-data.dta , clear

d, f

keep Date ///
nantong_temp nantong_shuang_api_xx nantong_shuang_api_pm10 ///
precipitation tmax_cel tmin_cel tmax_fah tmin_fah visib temp dewp slp stp wdsp mxspd prcp frshtt 

tempfile weather
save `weather'

************************************************************
**   Bring in the key pollutants
************************************************************

use ../dta/imputed-pollution-measures.dta , clear

d, f

tempfile our_pollutants
save `our_pollutants'

************************************************************
**   Bring in the CTRIP data
************************************************************

use ../dta/ctrip-cleaned-daily-records.dta , clear

d, f
sum

************************************************************
**   Clean outcomes
************************************************************

sum LogInLengthsecond CallLengthsecond NumCall

gen calls_per_minute = NumCall / (LogInLengthsecond / 60)
gen log_calls_per_minute = log(calls_per_minute)
sum log_calls_per_minute calls_per_minute

gen calls_per_phone_minutes = NumCall / (CallLengthsecond / 60)
gen log_calls_per_phone_minutes = log(calls_per_phone_minutes)

gen log_ph_min_per_call = log( CallLengthsecond / NumCall )
sum log_ph_min_per_call

gen minutes_on_phone = CallLengthsecond / 60
gen log_minutes_on_phone = log(CallLengthsecond / 60)
gen log_minutes_logged = log(LogInLengthsecond / 60)
sum log_minutes_logged LogInLengthsecond

gen log_NumCall = log(NumCall)
sum log_NumCall NumCall

gen log_waiting_time = log(LogInLengthsecond - CallLengthsecond)
sum log_waiting_time

gen waiting_ratio = (LogInLengthsecond - CallLengthsecond) / LogInLengthsecond 
gen log_waiting_ratio = log(waiting_ratio)
sum waiting_ratio log_waiting_ratio
centile waiting_ratio , c(1 5 10 20 30 40 50 60 70 80 90 95 99)

** We label workers as late if they show up after 10 AM
gen byte late_to_work = (hour1 >= 10) 
replace late_to_work = . if hour1 == .
sum late_to_work

gen log_grosswage = log(grosswage)
sum log_grosswage grosswage

** New outcomes to capture mechanical effect
gen log_minphone_minlog = log( CallLengthsecond / LogInLengthsecond )
gen log_calls_minlog = log( NumCall / LogInLengthsecond)
sum log_*

************************************************************
**   Describe variation in hotel versus flight
************************************************************

** We were told that workers are permanently put on a "flight"
** or "hotel" team. But there appears to be some switching.
preserve

	** RESTRICTIONS
	** only controls and chose-not-to-volunteer groups
	** everything before end of experiment
	tab expgroup
	*keep if expgroup == 0 | expgroup == 1 | expgroup == 2

	collapse (mean) hotel (first) expgroup, by(EmployeeID) fast

	count if hotel != 0 & hotel != 1 & hotel != . & expgroup == 0
	count if hotel != 0 & hotel != 1 & hotel != . & expgroup == 1
	count if hotel != 0 & hotel != 1 & hotel != . & expgroup == 2
	count if hotel != 0 & hotel != 1 & hotel != . & expgroup == .

	sum hotel
	centile hotel , c(1 2 3 4 5 10 20 30 40 50 60 70 80 90 95 96 97 98 99)
restore

** We create a variable to mark those who switch 
gen byte flight = 1 - hotel
egen max_hotel = max(hotel) , by(EmployeeID)
egen max_flight = max(flight) , by(EmployeeID)
gen byte hotel_flight_switcher = (max_hotel == max_flight == 1)
sum max_hotel max_flight hotel_flight_switcher
table expgroup , c(mean hotel_flight_switcher)
drop max_hotel max_flight flight

************************************************************
**   Clean the time-clock variables
************************************************************

** Note that, for a sub-sample of observations, we have time-card
** data. We study those outcomes here.
gen timeclock_hours = (hour2 - hour1) + ((minute2 - minute1) / 60)
sum timeclock_hours

gen byte timeclock_hours_missing = (timeclock_hours == .)

table expgroup , c(mean timeclock_hours_missing)

sum timeclock_hours* 

sum timeclock_hours* 
table yearmonth , c(mean timeclock_hours N timeclock_hours)
table expgroup , c(mean timeclock_hours N timeclock_hours )

** The time-clock variables are problematic for several reasons.
** Many observations seem to have longer logged-in times than 
** time-clock hours
gen hours_logged_in = LogInLengthsecond / 60 / 60
compare timeclock_hours hours_logged_in
centile timeclock_hours hours_logged_in, c(10 20 30 40 50 60 70 80 90 95 99)

************************************************************
**   Merge CTRIP data to weather and pollution
************************************************************

merge m:1 Date using `weather'
drop _merge 

merge m:1 Date using `our_pollutants'
drop _merge 

************************************************************
**   Describe minutes logged in versus NumCall
************************************************************

** There appear to be days when there are no calls but many minutes
** logged in.
preserve

	** RESTRICTIONS
	** only controls and chose-not-to-volunteer groups
	** everything before end of experiment
	keep if expgroup == 0 | expgroup == 2
	keep if Date <= mdy(8, 14, 2011) 

	sum NumCall CallLengthsecond LogInLengthsecond 
	count if NumCall == . & LogInLengthsecond != .
	count if NumCall == 0 & LogInLengthsecond != 0

	qui reg NumCall junjie_api_xx hotel tmax_fah precipitation 
	disp "Number of observations: `e(N)'"
	gen in_numcall_sample = e(sample)
	qui reg LogInLengthsecond junjie_api_xx hotel tmax_fah precipitation 
	disp "Number of observations: `e(N)'"
	gen in_login_sample = e(sample)

	tab in_numcall_sample in_login_sample
	tab Date if in_numcall_sample == 0 & in_login_sample == 1
	tab EmployeeID if in_numcall_sample == 0 & in_login_sample == 1

restore

************************************************************
**   Clean the environmental variables
************************************************************

** Harmonize temperature across cities, so both are in Fahrenheit
sum nantong_temp tmax_fah
replace nantong_temp = 32 + (9 / 5) * nantong_temp
sum nantong_temp tmax_fah

** Squared temperature
gen tmax_fah2 = tmax_fah ^ 2
gen nantong_temp2 = nantong_temp ^ 2

sum junjie_api_xx pm10 pm25 pm25_imp
table yearmonth , c(mean junjie_api_xx mean pm10 mean pm25 mean pm25_imp)

** NOTE: To keep pollution point estimates interpretable, we divide by 10 here
sum junjie_api_xx pm10 pm25 pm25_imp
foreach var in junjie_api_xx pm10 pm25 pm25_imp nantong_shuang_api_pm10 nantong_shuang_api_xx {
	replace `var' = `var' / 10
}
sum junjie_api_xx pm10 pm25 pm25_imp

************************************************************
**   Clean commute variable
************************************************************

sum commute
sum commute , det

gen byte commute90 = (commute >= 90 & commute < .)
gen byte commute120 = (commute >= 120 & commute < .)

bysort EmployeeID: gen byte first_ob = (_n == 1)
centile commute if first_ob == 1 & commute != ., c(10 25 33 50 66 75 90)
centile commute if first_ob == 1 & commute != ., c(33 66)
local lower = r(c_1)
local upper = r(c_2)
gen commute_tercile = 1 * (commute >= 0 & commute < `lower') + 2 * (commute >= `lower' & commute < `upper') + 3 * (commute >= `upper' & commute < .)
drop first_ob
gen byte commute_tercile1 = (commute_tercile == 1)
gen byte commute_tercile2 = (commute_tercile == 2)
gen byte commute_tercile3 = (commute_tercile == 3)

************************************************************
**   Tom's adjustment
************************************************************

** Before adding this, nantong_shuang_api_pm10 was restricted
** to be above 50.

replace nantong_shuang_api_pm10 = nantong_shuang_api_xx if nantong_shuang_api_pm10==.

************************************************************
**   Create leads and lags of the pollution variables
************************************************************

preserve

	bysort Date: keep if _n == 1

	sort Date

	list Date junjie_api_xx in 1/50

	foreach pollutant in junjie_api_xx pm10 pm25 pm25_imp nantong_shuang_api_pm10 {
		forvalues i = 1/5 {
			gen `pollutant'_m`i' = `pollutant'[_n - `i']
			gen `pollutant'_p`i' = `pollutant'[_n + `i']

			** If there's a gap in the dates, then we replace the lead or 
			** lag to be missing.
			replace `pollutant'_m`i' = . if Date[_n - `i'] != Date - `i' 
			replace `pollutant'_p`i' = . if Date[_n + `i'] != Date + `i' 
		}
	}

	list Date junjie_api_xx junjie_api_xx_p1 junjie_api_xx_m1 junjie_api_xx_m2 in 1/50

	keep Date *_m1 *_m2 *_m3 *_m4 *_m5 *_p1 *_p2 *_p3 *_p4 *_p5

	tempfile lead_lags
	save `lead_lags'
restore

merge m:1 Date using `lead_lags' 
drop _merge

************************************************************
**   Restrictions
************************************************************

drop if hotel_flight_switcher

** Before just dropping those with no calls, I flag problematic
** workers. This is a varaible we use below. The issue is that relatively
** few of the workers we have take many calls.
gen byte problematic_calls = (NumCall == . | NumCall == 0)
egen problem_share = mean(problematic_calls) , by(EmployeeID)
sum problem_share
sum problem_share , det
drop problematic_calls

drop if NumCall == 0 | NumCall == .

** NOTE: we have further restrictions based on 
** (1) date
** (2) experimental group
** in each section below.

************************************************************
**   Create identifiers for a clean sample
************************************************************

gen byte shanghai_prod_sample = 0
replace shanghai_prod_sample = 1 if (expgroup == 0 | expgroup == 1 | expgroup == 2) & (Date <= mdy(8, 14, 2011) ) 
replace shanghai_prod_sample = 0 if minutes_on_phone == 0 | junjie_api_xx == . | LogInLengthsecond == . | LogInLengthsecond == 0 | NumCall == .
count if shanghai_prod_sample == 1

gen byte shanghai_home_exp_sample = 0
replace shanghai_home_exp_sample = 1 if (Date >= mdy(12, 6, 2010) & Date <= mdy(8, 14, 2011) ) & (expgroup == 0 | expgroup == 1) & junjie_api_xx != .
count if shanghai_home_exp_sample == 1

gen byte nantong_prod_sample = 0
replace nantong_prod_sample = 1 if (first_initial == "N") & nantong_shuang_api_pm10 != .
replace nantong_prod_sample = 0 if minutes_on_phone == . | minutes_on_phone == 0 | nantong_shuang_api_pm10 == . | LogInLengthsecond == 0 | LogInLengthsecond == . | NumCall == . | nantong_temp == . 
count if nantong_prod_sample == 1

** One check here:
count if first_initial == "N"
codebook Date if first_initial == "N"
codebook Date if first_initial == "N" & nantong_prod_sample != .

************************************************************
**   Prepare Fixed Effects
************************************************************

** The following code is easier if we break apart the fixed
** effects as follows...

capture program drop create_fes
program define create_fes

	** We now prepare the fixed effects...
	gen ym = 100 * year(Date) + month(Date)
	sum ym
	qui levelsof ym , local(cells)
	foreach cell of local cells {
		disp "Now on value `cell'"
		gen byte ym_`cell'  = (ym == `cell')
	}
	tab day_of_week
	forvalues i = 0/6 {
		gen byte dow_`i' = (day_of_week == `i')
	}
end

create_fes

capture program drop take_out_worker_fe
program define take_out_worker_fe
	** Now we have partial out the employee fixed effects
	foreach var in ///
	hotel ltemp ltemp2 ///
	ym_201001 ym_201002 ym_201003 ym_201004 ym_201005 ym_201006 ym_201007 ym_201008 ym_201009 ym_201010 ym_201011 ym_201012 ym_201101 ym_201102 ym_201103 ym_201104 ym_201105 ym_201106 ym_201107 ym_201108 ym_201109 ym_201110 ym_201111 ym_201112 ym_201201 ym_201202 ym_201203 ym_201204 ym_201205 ym_201206 ym_201207 ym_201208 ym_201209 ym_201210 ym_201211 ym_201212 ///
	dow_0 dow_1 dow_2 dow_3 dow_4 dow_5 dow_6 ///
	`1' {
		disp "Now on variable `var'"
		qui areg `var' , a(EmployeeID) 
		predict r_`var' , resid
	}
end

************************************************************
**   Create lists of key variables
************************************************************

local pollutants = "junjie_api_xx pm10 pm25 pm25_imp"
local full_window_pollutants = "junjie_api_xx pm10 pm25_imp"
d `pollutants' , f
sum `pollutants'

local key_outcomes = "log_NumCall log_minutes_on_phone log_calls_per_phone_minutes"
d `key_outcomes' , f
sum `key_outcomes'

local key_outcomes_levels = "NumCall minutes_on_phone calls_per_phone_minutes"
d `key_outcomes_levels' , f
sum `key_outcomes_levels'

estimates clear
}

************************************************************
************************************************************
**   Analysis Section Below
************************************************************
************************************************************

************************************************************
**   Do bootstrap...
************************************************************

if "`step'" != "4" {
preserve
	keep if (shanghai_prod_sample == 1 | nantong_prod_sample == 1)

	** Creating single local API measure **
	gen api=junjie_api_xx
	replace api=nantong_shuang_api_pm10 if  nantong_prod_sample==1

	gen byte api_0_5 = (api > 0 & api <= 5 )
	gen byte api_5_10 = (api > 5 & api <= 10 )
	gen byte api_10_15 = (api > 10 & api <= 15 )
	gen byte api_15_20 = (api > 15 & api <= 20 )
	gen byte api_20_p = (api > 20 & api < . )

	** Creating single local temp measure **
	gen ltemp=tmax_fah
	replace ltemp=nantong_temp if nantong_prod_sample==1
	gen ltemp2 = ltemp*ltemp

	egen intersection = group(EmployeeID Date)

	tempfile working
	save `working'

	** RESULTS: Linear productivity, original SE's, merged sample
	foreach outcome in log_NumCall log_minutes_logged {

		use `working' , clear
		matrix q_estimates = J(15, 2, .)

		forvalues q = 1(1)9 {

			timer clear 1
			timer on 1 

			disp "Now on outcome `outcome', quantile 0.`q'"
			if "`step'" == "1" {
				bootstrap, cluster(Date) reps(50): qreg `outcome' api hotel ltemp ltemp2 ym_* dow_* , q(0.`q'0)
			}
			if "`step'" == "2" {
				bootstrap, cluster(EmployeeID) reps(50): qreg `outcome' api hotel ltemp ltemp2 ym_* dow_* , q(0.`q'0)
			}
			if "`step'" == "3" {
				bootstrap, cluster(intersection) reps(50): qreg `outcome' api hotel ltemp ltemp2 ym_* dow_* , q(0.`q'0)
			}
			matrix q_estimates[`q',1] = _b[api]
			matrix q_estimates[`q',2] = _se[api]

			timer off 1
			qui timer list
			display "For `outcome', quantile 0.`q', time elapsed: " `r(t1)'/(60 * 60) " hours. Or equivalently " `r(t1)'/(60) " minutes."
		}

		drop _all
		svmat q_estimates

		rename q_estimates1 beta_step`step'
		rename q_estimates2 se_step`step'
		gen quantile = _n / 10
		drop if beta_step`step' == .

		save ../dta/quantile-regressions-step`step'-`outcome'.dta , replace

	}

restore
}

************************************************************
**   Put quantile results together, FP
************************************************************

if "`step'" == "4" {
foreach outcome in log_NumCall log_minutes_logged {

	use ../dta/quantile-regressions-step1-`outcome'.dta , clear
	merge 1:1 quantile using ../dta/quantile-regressions-step2-`outcome'.dta 
	drop _merge
	merge 1:1 quantile using ../dta/quantile-regressions-step3-`outcome'.dta 
	drop _merge

	list , clean

	gen lower = beta_step1 - 1.98 * (se_step1^2 + se_step2^2 - se_step3^2)^(0.5)
	gen upper = beta_step1 + 1.98 * (se_step1^2 + se_step2^2 - se_step3^2)^(0.5)

	format beta_step1 %9.3f
	format lower %9.3f
	format upper %9.3f

	local inv_golden_ratio = 2 / ( sqrt(5) + 1 )
	graph set window fontface "Garamond" 
	twoway ///
	(connected beta_step1 quantile, lpattern(solid) lcolor(gray) mcolor(blue) msymbol(o)) ///
	(connected lower quantile, lpattern(dash) lcolor(gray) msymbol(i)) ///
	(connected upper quantile, lpattern(dash) lcolor(gray) msymbol(i)) ///
	, ///
	ylabel(, nogrid angle(horizontal)  ) ///
	scheme(s2mono)  ///
	aspectratio(`inv_golden_ratio') ///
	graphregion(fcolor(white))  ///
	legend(off) ///
	yline(0) ///
	yscale( nofextend ) xscale(nofextend) ///
	xtitle(" ") ///
	ytitle(" ")

	graph save ../gph/quantilecluster-effects-`outcome'.gph , replace

}
}

************************************************************
**   Quantile Regressions, original version
************************************************************

** This is the old quantile regression code...
** This takes *forever* to run, so I comment this out for now...

*** preserve
*** 	keep if (shanghai_prod_sample == 1 | nantong_prod_sample == 1)
*** 
*** 	** Creating single local API measure **
*** 	gen api=junjie_api_xx
*** 	replace api=nantong_shuang_api_pm10 if  nantong_prod_sample==1
*** 
*** 	gen byte api_0_5 = (api > 0 & api <= 5 )
*** 	gen byte api_5_10 = (api > 5 & api <= 10 )
*** 	gen byte api_10_15 = (api > 10 & api <= 15 )
*** 	gen byte api_15_20 = (api > 15 & api <= 20 )
*** 	gen byte api_20_p = (api > 20 & api < . )
*** 
*** 	** Creating single local temp measure **
*** 	gen ltemp=tmax_fah
*** 	replace ltemp=nantong_temp if nantong_prod_sample==1
*** 	gen ltemp2 = ltemp*ltemp
*** 
*** 	tempfile working
*** 	save `working'
*** 
*** 	** RESULTS: Linear productivity, original SE's, merged sample
*** 	foreach outcome in log_NumCall log_minutes_logged {
*** 
*** 		use `working' , clear
*** 		matrix q_estimates = J(15, 3, .)
*** 
*** 		forvalues q = 1(1)9 {
*** 
*** 			timer clear 1
*** 			timer on 1 
*** 
*** 			disp "Now on outcome `outcome', quantile 0.`q'"
*** 			bootstrap, cluster(Date) reps(50): qreg `outcome' api hotel ltemp ltemp2 ym_* dow_* , q(0.`q'0)
*** 			matrix q_estimates[`q',1] = _b[api]
*** 			matrix q_estimates[`q',2] = _b[api] - 1.98 * _se[api]
*** 			matrix q_estimates[`q',3] = _b[api] + 1.98 * _se[api]
*** 
*** 			timer off 1
*** 			qui timer list
*** 			display "For `outcome', quantile 0.`q', time elapsed: " `r(t1)'/(60 * 60) " hours. Or equivalently " `r(t1)'/(60) " minutes."
*** 		}
*** 
*** 		drop _all
*** 		svmat q_estimates
*** 
*** 		rename q_estimates1 beta
*** 		rename q_estimates2 lower
*** 		rename q_estimates3 upper
*** 		gen quantile = _n / 10
*** 		drop if beta == .
*** 
*** 		format beta %9.3f
*** 		format lower %9.3f
*** 		format upper %9.3f
*** 
*** 		local inv_golden_ratio = 2 / ( sqrt(5) + 1 )
*** 		graph set window fontface "Garamond" 
*** 		twoway ///
*** 		(connected beta quantile, lpattern(solid) lcolor(gray) mcolor(blue) msymbol(o)) ///
*** 		(connected lower quantile, lpattern(dash) lcolor(gray) msymbol(i)) ///
*** 		(connected upper quantile, lpattern(dash) lcolor(gray) msymbol(i)) ///
*** 		, ///
*** 		ylabel(, nogrid angle(horizontal)  ) ///
*** 		scheme(s2mono)  ///
*** 		aspectratio(`inv_golden_ratio') ///
*** 		graphregion(fcolor(white))  ///
*** 		legend(off) ///
*** 		yline(0) ///
*** 		yscale( nofextend ) xscale(nofextend) ///
*** 		xtitle(" ") ///
*** 		ytitle(" ")
*** 
*** 		graph save ../gph/quantile-effects-`outcome'.gph , replace
*** 	}
*** 
*** restore

log close 
exit

